import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#plt.style.use('seaborn-notebook')
import sklearn
# sanity-check the module versions this notebook was developed with
# (the comparison pins exact versions, not "latest")
if (pd.__version__ != "1.1.4") or (sklearn.__version__ != "0.23.2"):
    print("Warning: expected pandas 1.1.4 and scikit-learn 0.23.2 -- "
          "other versions may behave differently")
# set path to data-files
p2data = '/Users/froux/Documents/data/'
fnames = ['user1.txt','user2.txt','user3.txt','user4.txt']
# Load each file, tag its rows with a 1-based user id, and concatenate
# ONCE at the end: calling pd.concat inside the loop re-copies the
# accumulated frame every iteration (quadratic in total rows).
chunks = []
for ix, f in enumerate(fnames, start=1):
    tmp = pd.read_csv(p2data + f, sep=',')
    print(f, "has", tmp.shape[0], "rows")
    tmp['user'] = ix
    chunks.append(tmp)
dat = pd.concat(chunks, axis=0, ignore_index=True)
user1.txt has 20266 rows
user2.txt has 60256 rows
user3.txt has 26694 rows
user4.txt has 30608 rows
# inspect the resulting dataframe
print(dat.shape)
print(dat.columns)
dat
(137824, 9)
Index(['timestamp', 'date_id', 'dow', 'utc_time_sec', 'offset_time_sec',
'accuracy', 'longitude', 'latitude', 'user'],
dtype='object')
| timestamp | date_id | dow | utc_time_sec | offset_time_sec | accuracy | longitude | latitude | user | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 2015-03-01 07:16:39 | 20150301 | Sun | 1425190599 | -3600 | 10.000000 | -64.979205 | -15.687428 | 1 |
| 1 | 2015-03-01 07:17:12 | 20150301 | Sun | 1425190632 | -3600 | 29.000000 | -64.979196 | -15.687460 | 1 |
| 2 | 2015-03-01 07:17:25 | 20150301 | Sun | 1425190645 | -3600 | 33.000000 | -64.979229 | -15.687334 | 1 |
| 3 | 2015-03-01 07:18:19 | 20150301 | Sun | 1425190699 | -3600 | 25.000000 | -64.979197 | -15.687459 | 1 |
| 4 | 2015-03-01 07:18:25 | 20150301 | Sun | 1425190705 | -3600 | 17.677000 | -64.979197 | -15.687459 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 137819 | 2015-04-01 00:08:25 | 20150401 | Wed | 1427839705 | -3600 | 776.625977 | -64.609553 | -15.031567 | 4 |
| 137820 | 2015-04-01 00:09:29 | 20150401 | Wed | 1427839769 | -3600 | 872.018982 | -64.609553 | -15.031567 | 4 |
| 137821 | 2015-04-01 00:10:30 | 20150401 | Wed | 1427839830 | -3600 | 963.453003 | -64.609553 | -15.031567 | 4 |
| 137822 | 2015-04-01 00:11:31 | 20150401 | Wed | 1427839891 | -3600 | 1326.000000 | -64.609553 | -15.031567 | 4 |
| 137823 | 2015-04-01 00:12:38 | 20150401 | Wed | 1427839958 | -3600 | 1326.000000 | -64.609553 | -15.031567 | 4 |
137824 rows × 9 columns
# define some functions to toggle dow between str and int
def dow2num(dow):
    """Map a weekday abbreviation ('Mon'..'Sun') to 1..7.

    Returns None when *dow* matches no weekday.  Uses `==` comparison
    (not a dict lookup) so 1-element numpy string arrays, which some
    callers pass, keep working.
    """
    names = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    for number, name in enumerate(names, start=1):
        if dow == name:
            return number
def num2dow(num):
    """Map a weekday number 1..7 back to its abbreviation ('Mon'..'Sun').

    Returns None when *num* is outside 1..7.  Mirrors dow2num.
    """
    names = ('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun')
    for number, name in enumerate(names, start=1):
        if num == number:
            return name
# define function to compute distance between two points in sphere
# https://en.wikipedia.org/wiki/Haversine_formula
# https://math.stackexchange.com/questions/2978144/difference-between-haversine-and-euclidean-distance
from numpy import radians, cos, sin, arcsin, sqrt,pi
def haversine(lon, lat):
    """
    Great-circle distances between CONSECUTIVE points on the Earth.

    Parameters
    ----------
    lon, lat : array-like of float, same length n
        Longitudes and latitudes in decimal degrees.

    Returns
    -------
    numpy.ndarray, shape (n,)
        Element 0 is 0.0 (no predecessor); element i >= 1 is the
        Haversine distance in km from point i-1 to point i.

    https://en.wikipedia.org/wiki/Haversine_formula
    """
    # convert decimal degrees to radians (asarray also accepts plain lists)
    lon = np.radians(np.asarray(lon, dtype=float))
    lat = np.radians(np.asarray(lat, dtype=float))
    # differences between consecutive samples
    dlon = np.diff(lon)
    dlat = np.diff(lat)
    lat1 = lat[:-1]
    lat2 = lat[1:]
    # Haversine formula
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    c = 2.0 * np.arcsin(np.sqrt(a))
    r = 6371.0  # mean Earth radius [km]
    # prepend 0 so the output aligns with the input samples
    return np.concatenate((np.zeros(1), c * r))
The distance between Big Ben in London (51.5007° N, 0.1246° W) and The Statue of Liberty in New York (40.6892° N, 74.0445° W) is 5574.8 km.
tmp = pd.DataFrame([[51.5007,0.1246],[40.6892,74.0445]],columns=["latitude","longitude"])
print(tmp)
print(haversine(tmp["longitude"].values,tmp["latitude"].values)[1])
   latitude  longitude
0   51.5007     0.1246
1   40.6892    74.0445
5574.840456848555
# convert dow from str to int (1=Mon .. 7=Sun)
dat['downum'] = dat['dow'].map(dow2num)
# extract the HH:MM:SS part of 'YYYY-MM-DD HH:MM:SS' timestamps
dat['time'] = dat['timestamp'].str.split(' ').str[1]
# convert clock time to seconds since midnight, vectorized instead of
# a per-row Python comprehension
hms = dat['time'].str.split(':', expand=True).astype(int)
dat['time_sec'] = hms[0] * 3600 + hms[1] * 60 + hms[2]
# get unique date_id values
date_id = np.unique(dat['date_id'])
# inspect total number of unique date_ids
print(len(date_id))
#returns 32
32
# For each calendar day and each user, compute the distance and speed
# between consecutive GPS fixes and write them back into `dat`.
for day in date_id:
    tmp = dat.loc[dat['date_id'] == day, ].copy()
    for u in np.unique(tmp['user']):
        geo_pos = tmp.loc[tmp['user'] == u, ['longitude', 'latitude', 'time_sec']].copy()
        # spherical distance between consecutive fixes [km]
        d = haversine(geo_pos.longitude.values, geo_pos.latitude.values)
        # elapsed time between consecutive fixes [s]
        dt = geo_pos['time_sec'].diff()
        # speed [km/s]; first sample has no predecessor -> NaN -> 0
        s = (d / dt).fillna(0.)
        # zero-duration gaps give +/- inf -> clamp to 0
        s[s == np.inf] = 0
        s[s == -np.inf] = 0
        geo_pos['Dist'] = d
        geo_pos['Speed'] = s
        tmp.loc[tmp['user'] == u, 'Dist'] = geo_pos['Dist']
        tmp.loc[tmp['user'] == u, 'Speed'] = geo_pos['Speed']
    dat.loc[dat['date_id'] == day, 'Dist'] = tmp['Dist']
    dat.loc[dat['date_id'] == day, 'Speed'] = tmp['Speed']
# sanity-check: no NaNs or infinities should be left behind
print(dat['Dist'].isna().any())
print(dat['Speed'].isna().any())
# np.isinf catches both +inf and -inf; the original `== np.inf` test
# would have missed negative infinities
print(np.isinf(dat['Dist']).any())
print(np.isinf(dat['Speed']).any())
False False False False
# quick visual inspection of the per-sample distance and speed series
plt.figure(figsize=(10, 10))
plt.subplot(221)
plt.plot(dat['Dist'], '.')
plt.xlabel('Sample #')
plt.ylabel('Distance [km]')
plt.subplot(222)
plt.plot(dat['Speed'], '.')
plt.xlabel('Sample #')
# fixed label: this panel shows speed, not distance
plt.ylabel('Speed [km/s]')
Text(0, 0.5, 'Distance [km/s]')
# split df by user and date_id
dfs = dat.groupby(['user','date_id'])
dfs = [dfs.get_group(x) for x in dfs.groups]
print("There are n= "+str(len(dfs))+" measurements (user x date_id)")
# compute some stats: one row per (user, date_id) group.
# Building a list of dicts and constructing the frame once keeps the
# integer columns integer (cell-wise .loc writes coerced n_samples to
# float) and drops the unused `usr` accumulator.
rows = []
for grp in dfs:
    t = grp['utc_time_sec'].values.astype(int)
    rows.append({
        # each group is a single (user, date_id) pair, so the first
        # value of these columns is the unique identifier
        'date_id': int(grp['date_id'].iloc[0]),
        # total time in hrs spanned by the samples of this group
        'total_time': (t.max() - t.min()) / 60 / 60,
        'user': int(grp['user'].iloc[0]),
        'dow': grp['dow'].iloc[0],
        # total number of samples collected per user and date_id
        'n_samples': len(grp),
    })
df_tmp = pd.DataFrame(rows, columns=['date_id','total_time','user','dow','n_samples'])
# inspect df
df_tmp
There are n= 121 measurements (user x date_id)
| date_id | total_time | user | dow | n_samples | |
|---|---|---|---|---|---|
| 0 | 20150301 | 7.84167 | 1 | Sun | 885.0 |
| 1 | 20150302 | 15.85 | 1 | Mon | 1363.0 |
| 2 | 20150303 | 10.7092 | 1 | Tue | 1019.0 |
| 3 | 20150304 | 15.7194 | 1 | Wed | 1049.0 |
| 4 | 20150305 | 13.8414 | 1 | Thu | 1084.0 |
| ... | ... | ... | ... | ... | ... |
| 116 | 20150328 | 22.4331 | 4 | Sat | 712.0 |
| 117 | 20150329 | 15.2294 | 4 | Sun | 759.0 |
| 118 | 20150330 | 14.8533 | 4 | Mon | 715.0 |
| 119 | 20150331 | 16.1544 | 4 | Tue | 735.0 |
| 120 | 20150401 | 0.209167 | 4 | Wed | 13.0 |
121 rows × 5 columns
# compute total distance traveled by each user on each day-of-week
total_d = {}
# split dat by user
dfs2 = dat.groupby('user')
dfs2 = [dfs2.get_group(x) for x in dfs2.groups]
# should return 4
print(len(dfs2))
# loop over users
for user_df in dfs2:
    # get unique user identifier
    usr = int(np.unique(user_df['user'].values))
    # further split this user's data by dow
    by_dow = user_df.groupby('dow')
    by_dow = [by_dow.get_group(x) for x in by_dow.groups]
    # should return 7
    print(len(by_dow))
    # loop over dow
    for dow_df in by_dow:
        # get unique dow identifier
        dow = str(np.unique(dow_df['dow'].values)[0])
        # spherical distance between consecutive fixes for this user/dow
        d = haversine(dow_df['longitude'].values, dow_df['latitude'].values)
        # total distance traveled by user at dow; setdefault replaces
        # the clunky `list(total_d.keys()).count(dow)==0` membership test
        total_d.setdefault(dow, []).append([usr, sum(d)])
# inspect dict
total_d
4 7 7 7 7
{'Fri': [[1, 142.46806053310553],
[2, 234.46505202247823],
[3, 381.2464154792379],
[4, 1363.612592786522]],
'Mon': [[1, 619.0559660412223],
[2, 435.6762111105262],
[3, 534.9857393748978],
[4, 1081.6035055384127]],
'Sat': [[1, 262.2948897743413],
[2, 627.6330255315878],
[3, 549.4025914713986],
[4, 1232.488149360879]],
'Sun': [[1, 370.9955286482481],
[2, 458.03248568397584],
[3, 267.80836972373714],
[4, 2409.6560736057945]],
'Thu': [[1, 213.84787112477107],
[2, 255.48625426212675],
[3, 478.5380662543046],
[4, 1311.1406325530252]],
'Tue': [[1, 2120.938203844221],
[2, 394.35993002070785],
[3, 390.4372556516837],
[4, 1582.0491426834058]],
'Wed': [[1, 2802.9792762001744],
[2, 436.6047532923059],
[3, 486.5344551987476],
[4, 1008.0197288661864]]}
# make plots to visualize stats computed above
# per-user colors and markers (index 0..3 -> user 1..4)
c = ['blue','orange','red','green']
m = ['^','o','.','x']
fig = plt.figure(figsize=(10,10))
# panel 1: total recording time per date, one scatter series per user
ax1 = plt.subplot(2,2,1)
for i in range(1,5):
ax1.scatter(df_tmp['date_id'].values[df_tmp['user']==i],df_tmp['total_time'].values[df_tmp['user']==i],color=c[i-1],marker=m[i-1])
ax1.legend(['usr1','usr2','usr3','usr4'])
ax1.set_xlabel('date_id')
ax1.set_ylabel('Total time [hrs]')
#ax2 = plt.subplot(2,2,2)
# panel 3: sample counts per dow; the four user bars share one x position
# and are drawn largest-first so smaller bars stay visible on top
# (overlaid, not stacked)
ax3 = plt.subplot(2,2,3)
g = df_tmp.groupby('dow')
g = [g.get_group(x) for x in g.groups]
dow=[]
for i in range(len(g)):
# remember this group's dow label (1-element array from np.unique)
dow.append(np.unique(g[i]['dow'].values))
# total sample count per user (1..4) for this dow
v = [g[i]['n_samples'].values[g[i]['user']==j].sum() for j in range(1,5)]
# descending order: draw the largest bar first, at the back
x = np.argsort(v)[::-1]
for j in range(len(x)):
plt.bar(i+1,v[x[j]],color=c[x[j]])
# tick labels in Mon..Sun order; dow2num compares with == so the
# 1-element numpy arrays in `dow` work here
s_ix = np.argsort(list(map(dow2num,dow))).flatten()
ax3.set_xticks(list(range(1,8)))
ax3.set_xticklabels([dow[ix] for ix in s_ix])
ax3.set_ylabel('Number of samples dow')
# panel 4: total distance per dow, same overlaid-bar scheme as panel 3
ax4 = plt.subplot(2,2,4)
for k in total_d.keys():
x = dow2num(k)
v = total_d[k]
# distances only (column 1 of the [user, distance] pairs)
d = np.asarray(v)[:,1]
u = np.argsort(d)[::-1]
for i in range(len(u)):
plt.bar(x,d[u[i]],color=c[u[i]])
# NOTE(review): reuses s_ix/dow computed for panel 3 — assumes both
# panels use the same 1..7 x positions for Mon..Sun; verify
ax4.set_xticks(list(range(1,8)))
ax4.set_xticklabels([dow[ix] for ix in s_ix])
ax4.set_ylabel('Distance traveled [kms]')
plt.tight_layout()
# make plots to visualize trajectories of users for differente date-ids
fig = plt.figure(figsize=(20,20))
for panel, day in enumerate(date_id):
    # all samples recorded on this calendar day
    tmp = dat.loc[dat['date_id'] == day, ]
    long = tmp['longitude'].values
    lat = tmp['latitude'].values
    ax = plt.subplot(6, 7, panel + 1)
    # one scatter series per user, fixed color/marker scheme
    ax.scatter(long[tmp['user']==1], lat[tmp['user']==1], color='blue', marker='v')
    ax.scatter(long[tmp['user']==2], lat[tmp['user']==2], color='orange', marker='.')
    ax.scatter(long[tmp['user']==3], lat[tmp['user']==3], color='red', marker='x')
    ax.scatter(long[tmp['user']==4], lat[tmp['user']==4], color='green', marker='o')
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    plt.title(np.unique(tmp['dow']))
    # legend only once, on the first panel
    if panel == 0:
        plt.legend(['usr1','usr2','usr3','usr4'])
plt.tight_layout()
# make animation to visualize dynamic trajectory of user for specific date
import matplotlib
matplotlib.rcParams['animation.embed_limit'] = 2**128
from matplotlib import pyplot as plt
from celluloid import Camera
# user 2, fifth date, and keep only reasonably accurate fixes (<500)
tmp = dat.loc[dat['user']==2,]
tmp = tmp.loc[tmp['date_id']==date_id[4],]
tmp = tmp.loc[tmp['accuracy']<500,]
print(tmp.shape)
long = tmp['longitude'].values
lat = tmp['latitude'].values
t = tmp['utc_time_sec'].values
# create figure object
fig = plt.figure()
# load axis box, set axis limits to the trajectory's bounding box
ax = plt.axes(xlim=(np.min(long),np.max(long)),ylim=(np.min(lat),np.max(lat)))
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
camera = Camera(fig)
for i in range(len(long)):
    # frame i shows the first i+1 points: the original sliced [:i],
    # which left frame 0 empty and never drew the final point
    ax.scatter(long[:i+1], lat[:i+1], color='blue')
    camera.snap()
animation = camera.animate()
#animation.save('animation.gif', writer='PillowWriter', fps=2)
from IPython.display import HTML
HTML(animation.to_jshtml())
(2438, 14)